#!/usr/bin/env bash
# Preprocess the MindSpeed-Infinity-Instruct-7M dataset (Alpaca-style) into the
# binary format expected for Llama-2-7B instruction fine-tuning.
#
# Prerequisites:
#   - Download the Alpaca-style dataset as described in the
#     "数据集准备及处理" (dataset preparation) section of examples/README.md.
#   - Adjust the set_env.sh paths below to match your environment.
#
# NOTE(review): `set -u` is intentionally omitted — vendor set_env.sh scripts
# frequently reference unset variables. `-e` + `pipefail` still abort on errors.
set -eo pipefail

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

DATASET_PATH=/opt/local/datasets/BAAI/MindSpeed-Infinity-Instruct-7M
TOKENIZER_PATH=./model_from_hf/Llama-2-7b-hf/   # HuggingFace tokenizer directory
OUTPUT_DIR=./finetune_dataset/MindSpeed-Infinity-Instruct-7M

mkdir -p "${OUTPUT_DIR}"

# Optional flags (append to the command below if needed):
#   --overwrite-cache        rebuild the tokenization cache from scratch
#   --n-subs 8               number of dataset sub-shards to process
#   --map-keys '{"history": "history"}'
#   --map-keys '{"prompt":"instruction","query":"input","response":"output"}'  # default; may be omitted
python ./preprocess_data.py \
    --input "${DATASET_PATH}" \
    --tokenizer-name-or-path "${TOKENIZER_PATH}" \
    --output-prefix "${OUTPUT_DIR}/MindSpeed-Infinity-Instruct-7M" \
    --workers 16 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF \
    --cache-dir /opt/local/mindspeed_cache \
    --handler-name AlpacaStyleInstructionHandler \
    --prompt-type llama2